import numpy as np
import matplotlib.pyplot as plt
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
# Fix NumPy's global RNG seed so sampling/generation below is reproducible.
np.random.seed(0)
import math
import scipy
import scipy.stats
# print statistics of array and display array as image
def print_stat(a, str, show_image=True):
    """Print summary statistics of array *a*, labelled by *str*.

    Prints shape, min, max, mean, std and matrix rank; for square 2-D
    arrays the determinant is also printed.  Unless show_image is False,
    2-D arrays are additionally rendered with plt.imshow.

    NOTE: the parameter name ``str`` shadows the builtin; it is kept
    unchanged for backward compatibility with existing callers.
    """
    a = np.asarray(a)
    if a.size == 0:
        # FIX: np.min/np.max raise ValueError on empty input; report and bail.
        print(str, ": shape=", a.shape, "(empty array)")
        return
    print(str, ": shape=", a.shape, "min=", np.min(a), "max=", np.max(a), "mean=", np.mean(a),
          "std=", np.std(a), "rank=", np.linalg.matrix_rank(a), end=" ")
    if len(a.shape) <= 1:
        # 1-D data: no determinant, nothing to show as an image.
        print()
        return
    if a.shape[0] == a.shape[1]:
        print("det=", np.linalg.det(a))
    else:
        print()
    if show_image:
        plt.imshow(a, interpolation="none")
        plt.colorbar()
        plt.show()
# extract a set of linearly independent base columns of a matrix
def extract_base(a, threshold):
    """Greedily collect linearly independent columns of *a*.

    Columns whose maximum absolute value is below *threshold* are
    treated as (almost) zero and skipped.  Returns (b, bindex, rank):
    the selected columns stacked as a matrix, their column indices as
    an array, and the resulting rank — or (None, None, None) when every
    column is almost zero.
    """
    # BUG FIX: the original used len(a) (the ROW count) as the number of
    # columns, which only works for square matrices; use the column count.
    n = a.shape[1]
    i1 = -1
    # Find the first column that is not almost zero.
    for i in range(n):
        b1 = a[:, i]
        if np.max(np.abs(b1)) > threshold:
            i1 = i
            break
    if i1 == -1:
        print("ERROR: a is almost zero")
        return None, None, None
    print("extract_base(threshold=", threshold, ") : n=", n, "i1=", i1, "max=", np.max(a[:,i1]))
    bindex = [i1]
    b = a[:, i1][:, None]
    rank = 1
    # Append each later (non-negligible) column only if it raises the rank.
    for i in range(i1 + 1, n):
        nb = a[:, i]
        if np.max(np.abs(nb)) < threshold:
            continue
        ba = np.hstack((b, nb[:, None]))
        if np.linalg.matrix_rank(ba) > rank:
            bindex.append(i)
            b = ba
            rank += 1
    bindex = np.array(bindex)
    return b, bindex, rank
# Load MNIST data in a format suited for tensorflow.
# The script input_data is available under this URL:
# https://raw.githubusercontent.com/tensorflow/tensorflow/master/tensorflow/g3doc/tutorials/mnist/input_data.py
import input_data
# Load MNIST; one_hot=True makes each label a 10-dim indicator vector,
# which the per-digit split below relies on.
mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
n_samples = mnist.train.num_examples
# get sample data
num_all_sample = 50000
x_sample, y_sample = mnist.train.next_batch(num_all_sample)
# Split the batch into one array of images per digit 0..9.
x_sample_of_chars = []
for char in range(10):
    # indices of samples labelled `char` (one-hot column == 1)
    index = np.arange(num_all_sample)[y_sample[:,char] == 1]
    x_sample_of_char = x_sample[index]
    x_sample_of_chars.append(x_sample_of_char)
# Calculate centers (mean images) of the image-clusters of digits 0..9
centers = []
for i in range(10):
    # FIX: use a loop-local name; the original reused `x_sample` and
    # clobbered the full training batch loaded above.
    cluster = x_sample_of_chars[i]
    center = np.average(cluster, axis=0)
    centers.append(center)
centers = np.array(centers)  # one mean image per digit
print_stat(centers, "centers")
# Variance-covariance matrix of each image-cluster, plus a factor A with
# A * A.T = cov (cov is symmetric => V = U.T => UrV(UrV).T=UsU.TUsU=UrU.T = cov)
As = []
for digit in range(10):
    cluster = x_sample_of_chars[digit]
    count = len(cluster)
    samples = cluster.T                 # columns are samples
    mean_vec = centers[digit]
    print("i=", digit, " n=", count)
    # Center the samples, then form the covariance matrix.
    centered = samples - mean_vec[:, None]
    # print_stat(centered, "z0")
    cov = np.dot(centered, centered.T) / count
    print_stat(cov, "cov", show_image=False)
    # Factor cov via its SVD: A = U * diag(sqrt(s)) * V.
    U, s, V = np.linalg.svd(cov, full_matrices=False)
    A = np.dot(np.dot(U, np.diag(np.sqrt(s))), V)
    # print_stat(A, "A")
    As.append(A)
    # check if A * A.T = cov
    print("max(abs(A * A.T - cov))=", np.max(np.max(np.dot(A, A.T) - cov)))
    print()
# IPython help syntax (notebook-only): show the docstring of plt.imshow.
plt.imshow?
# Generate images from A and centers: for each digit, show its center
# image followed by 5 pairs of sampled images (A*x without / with center).
cols = 10
plt.figure(figsize=(12, 12))
for i in range(10):
    # left-most column: the center (mean) image of digit i
    plt.subplot(10, (cols + 1), (cols + 1)*i + 1)
    plt.imshow(centers[i].reshape(28, 28), cmap="gray", vmin=0.0, vmax=1.0, interpolation="none")
    plt.title("center:"+str(i))
    plt.xticks([])
    plt.yticks([])
    A = As[i]
    u = centers[i][:, None]
    for j in range(5):
        plt.subplot(10, (cols + 1), (cols + 1)*i + j*2 + 2)
        # draw one latent vector x ~ N(0, I)
        x = np.random.standard_normal((784, 1))
        # NOTE(review): averaging over a length-1 axis is a no-op;
        # presumably a leftover from averaging multiple draws — confirm.
        x = np.average(x, axis=1)
        x = x[:, None]
        d = np.dot(A, x) # + u
        img = d.reshape((28, 28))
        # (duplicate subplot call — selects the same axes as above)
        plt.subplot(10, (cols + 1), (cols + 1)*i + j*2 + 2)
        plt.imshow(img, cmap="gray", vmin=0.0, vmax=1.0, interpolation="none")
        plt.title("without-u:"+str(j))
        plt.xticks([])
        plt.yticks([])
        d = np.dot(A, x) + u
        img = d.reshape((28, 28))
        plt.subplot(10, (cols + 1), (cols + 1)*i + j*2 + 1 + 2)
        plt.imshow(img, cmap="gray", vmin=0.0, vmax=1.0, interpolation="none")
        plt.title("with-u:"+str(j))
        plt.xticks([])
        plt.yticks([])
plt.tight_layout()
# Generate bigger images with colorbar from A and u: per digit, one figure
# with the center image and one sample shown without / with the center.
cols = 10
for i in range(10):
    plt.figure(figsize=(12, 3))
    # panel 1: the center image of digit i
    plt.subplot(131)
    plt.imshow(centers[i].reshape(28, 28), cmap="gray", vmin=0.0, vmax=1.0, interpolation="none")
    plt.colorbar()
    A = As[i]
    u = centers[i][:, None]
    # draw one latent vector x ~ N(0, I)
    x = np.random.standard_normal((784, 1))
    # NOTE(review): length-1 average is a no-op — see the cell above.
    x = np.average(x, axis=1)
    x = x[:, None]
    # panel 2: A*x without adding the center
    plt.subplot(132)
    d = np.dot(A, x) # + u
    img = d.reshape((28, 28))
    plt.imshow(img, cmap="gray", vmin=0.0, vmax=1.0, interpolation="none")
    plt.colorbar()
    plt.title("without-u:")
    # panel 3: A*x + center
    plt.subplot(133)
    d = np.dot(A, x) + u
    img = d.reshape((28, 28))
    plt.imshow(img, cmap="gray", vmin=0.0, vmax=1.0, interpolation="none")
    plt.colorbar()
    plt.title("with-u:")
    plt.tight_layout()
    plt.show()
# Calculate probability for each image in image-cluster using
# Variance-covariance matrix: factor cov, keep its base columns, build a
# (rescaled) pseudo-inverse, whiten the samples and score them.
covs = []
As = []
Bs = []
bindexes = []
BPIs = []
ps = []
for i in range(10):
    z = x_sample_of_chars[i]
    n = len(z)
    z = z.T                      # columns are samples (784 x n)
    u = centers[i]
    print("i=", i, " n=", n)
    z0 = z - u[:, None]          # center the samples
    print_stat(z0, "z0")
    # squared distance of every sample from the center
    d0 = np.sum(z0 * z0, axis=0)
    d0mean = np.mean(d0)
    print_stat(d0, "d0")
    # Calculate Variance-covariance matrix cov
    cov = np.dot(z0, z0.T)/n
    print_stat(cov, "cov")
    covs.append(cov)
    # Calculate A s.t. A * A.T = cov (cov symmetric => V = U.T in the SVD)
    U, s, V = np.linalg.svd(cov, full_matrices=False)
    r = np.sqrt(s)
    A = np.dot(np.dot(U, np.diag(r)), V)
    print_stat(A, "A")
    As.append(A)
    # check if A * A.T = cov
    print("max(abs(A * A.T - cov))=", np.max(np.max(np.dot(A, A.T) - cov)))
    # extract only base columns of A (threshold 0.05)
    B, index, rank = extract_base(A, 0.05)
    # keep only the corresponding rows -> square rank x rank matrix
    B = B[index,:]
    print_stat(B, "B")
    Bs.append(B)
    bindexes.append(index)
    # restrict the centered samples to the same rows
    z0 = z0[index, :]
    # print_stat(z0, "z0")
    # calculate pseudo-inverse of B
    BPI = np.linalg.pinv(B)
    print_stat(BPI, "BPI")
    # scale BPI to be det(BPI) = 1/d0mean , because distribution of x should be N(0, 1)
    U, s, V = np.linalg.svd(BPI, full_matrices=False)
    s = s / scipy.stats.mstats.gmean(s) # necessary to avoid det(BPI)=0 or inf
    d0r = math.pow(1.0 / d0mean, 1.0 / len(s))
    s *= d0r # very slight effect
    print("d0r=", d0r)
    BPI = np.dot(np.dot(U, np.diag(s)), V)
    print_stat(BPI, "BPI")
    BPIs.append(BPI)
    # map each centered sample into the (approximately) whitened space
    x = np.dot(BPI, z0)
    print_stat(x, "x")
    d = np.sum(x * x, axis=0)
    print_stat(d, "d")
    # NOTE(review): this feeds the squared norm d itself into the N(0,1)
    # pdf — a heuristic score rather than a true likelihood; confirm intent.
    p = scipy.stats.norm.pdf(d)
    print_stat(p, "p")
    ps.append(p)
# Display histogram of log(probability) in each image-cluster
plt.figure(figsize=(8, 12))
for i in range(10):
    plt.subplot(5, 2, i + 1)
    # exp(-500) ~ 7e-218 is a tiny positive offset so that log() below
    # never sees an exact zero probability
    p = ps[i] + np.exp(-500)
    print("i=", i ,"min=", np.min(p), " max=", np.max(p))
    plt.hist(np.log(p), bins=30)
    # plt.hist(p)
    plt.title("log(Probability) distribution of char="+str(i))
plt.tight_layout()
plt.show()
# Calculate ratio of data in cluster of which nearest center is its center
# Result: x% (x% - x%) of cluster member are nearest from the center
far_xs = []
nearest_centers = []
total_count = 0
total_ok_count = 0
for i in range(10):
    xi = x_sample_of_chars[i]
    n = len(xi)
    z = xi.T
    # probabilities of cluster i's members under their own model
    pi = ps[i]
    i_is_nearest = np.ones(n)
    # FIX: np.int / np.bool aliases were removed in NumPy 1.24 — use the
    # Python builtins instead (same dtypes).
    nearest_center = np.ones(n, dtype=int) * i
    nearest_p = np.copy(pi)
    # Compare against every other digit's model.
    for j in range(10):
        if i == j:
            continue
        u = centers[j]
        z0 = z - u[:, None]
        index = bindexes[j]
        # extract only base rows
        z0 = z0[index, :]
        BPI = BPIs[j]
        x = np.dot(BPI, z0)
        d = np.sum(x * x, axis=0)
        pj = scipy.stats.norm.pdf(d)
        # AND together "model i beats model j" over all j
        i_is_near = (pi > pj).astype(int)
        i_is_nearest = i_is_nearest * i_is_near
        # track the best (highest-probability) model per sample
        j_is_near = nearest_p < pj
        nearest_p[j_is_near] = pj[j_is_near]
        nearest_center[j_is_near] = j
    # samples whose best model is NOT their own digit
    i_is_far = (np.ones_like(i_is_nearest) - i_is_nearest).astype(bool)
    far_x = x_sample_of_chars[i][i_is_far]
    far_xs.append(far_x)
    nearest_centers.append(nearest_center[i_is_far])
    ok_count = np.sum(i_is_nearest)
    ok_ratio = ok_count/n
    ng_count = n - ok_count
    ng_ratio = ng_count/n
    print("{}: ok={:.0f}({:.1f}%), ng={:.0f}({:.1f}%)".format(
        i, ok_count, ok_ratio*100, ng_count, ng_ratio*100))
    total_count += n
    total_ok_count += ok_count
total_ok_ratio = total_ok_count / total_count
total_ng_count = total_count - total_ok_count
total_ng_ratio = total_ng_count / total_count
print("OK={:.0f}/{:.0f}({:.1f}%), NG={:.0f}/{:.0f}({:.1f}%)".format(
    total_ok_count, total_count, total_ok_ratio*100, total_ng_count, total_count, total_ng_ratio*100))
# Show training images that are "far" from their own center (i.e. some
# other center's model scores them higher), titled with the nearest digit.
cols = 10
offset = 0
plt.figure(figsize=(12, 12))
for i in range(10):
    # left-most column: the center image of digit i
    plt.subplot(10, (cols + 1), (cols + 1)*i + 1)
    plt.imshow(centers[i].reshape(28, 28), cmap="gray", interpolation="none")
    plt.title("center:"+str(i))
    plt.xticks([])
    plt.yticks([])
    len_far_i = len(far_xs[i])
    print("# of far data for char {} = {}".format(i, len_far_i))
    for j in range(offset, min(offset + cols, len_far_i)):
        # NOTE(review): subplot index uses j directly, so with offset > 0
        # the images would not start at the second column — confirm.
        plt.subplot(10, (cols + 1), (cols + 1)*i + j + 2)
        plt.imshow(far_xs[i][j].reshape(28, 28), cmap="gray", interpolation="none")
        plt.title("nearest:" + str(nearest_centers[i][j]))
        plt.xticks([])
        plt.yticks([])
plt.tight_layout()
# get test data
num_all_test = 10000
x_test, y_test = mnist.test.next_batch(num_all_test)
# Split the test batch into one array of images per digit 0..9,
# mirroring the per-digit split of the training batch above.
x_test_of_chars = []
for char in range(10):
    # indices of test samples labelled `char` (one-hot column == 1)
    index = np.arange(num_all_test)[y_test[:,char] == 1]
    x_test_of_char = x_test[index]
    x_test_of_chars.append(x_test_of_char)
# Calculate ratio of data in cluster of which nearest center is its center
# Result: x% (x% - x%) of cluster member are nearest from the center
# (same evaluation as on training data, but pi must be computed here
# because no probabilities were precomputed for the test set)
far_xs = []
nearest_centers = []
total_count = 0
total_ok_count = 0
for i in range(10):
    xi = x_test_of_chars[i]
    n = len(xi)
    z = xi.T
    # probability of cluster i's test members under their own model
    u = centers[i]
    z0 = z - u[:, None]
    index = bindexes[i]
    # extract only base rows
    z0 = z0[index, :]
    BPI = BPIs[i]
    x = np.dot(BPI, z0)
    d = np.sum(x * x, axis=0)
    pi = scipy.stats.norm.pdf(d)
    i_is_nearest = np.ones(n)
    # FIX: np.int / np.bool aliases were removed in NumPy 1.24 — use the
    # Python builtins instead (same dtypes).
    nearest_center = np.ones(n, dtype=int) * i
    nearest_p = np.copy(pi)
    # Compare against every other digit's model.
    for j in range(10):
        if i == j:
            continue
        u = centers[j]
        z0 = z - u[:, None]
        index = bindexes[j]
        # extract only base rows
        z0 = z0[index, :]
        BPI = BPIs[j]
        x = np.dot(BPI, z0)
        d = np.sum(x * x, axis=0)
        pj = scipy.stats.norm.pdf(d)
        # AND together "model i beats model j" over all j
        i_is_near = (pi > pj).astype(int)
        i_is_nearest = i_is_nearest * i_is_near
        # track the best (highest-probability) model per sample
        j_is_near = nearest_p < pj
        nearest_p[j_is_near] = pj[j_is_near]
        nearest_center[j_is_near] = j
    # samples whose best model is NOT their own digit
    i_is_far = (np.ones_like(i_is_nearest) - i_is_nearest).astype(bool)
    far_x = x_test_of_chars[i][i_is_far]
    far_xs.append(far_x)
    nearest_centers.append(nearest_center[i_is_far])
    ok_count = np.sum(i_is_nearest)
    ok_ratio = ok_count/n
    ng_count = n - ok_count
    ng_ratio = ng_count/n
    print("{}: ok={:.0f}({:.1f}%), ng={:.0f}({:.1f}%)".format(
        i, ok_count, ok_ratio*100, ng_count, ng_ratio*100))
    total_count += n
    total_ok_count += ok_count
total_ok_ratio = total_ok_count / total_count
total_ng_count = total_count - total_ok_count
total_ng_ratio = total_ng_count / total_count
print("OK={:.0f}/{:.0f}({:.1f}%), NG={:.0f}/{:.0f}({:.1f}%)".format(
    total_ok_count, total_count, total_ok_ratio*100, total_ng_count, total_count, total_ng_ratio*100))
# Show test images that are "far" from their own center (i.e. some other
# center's model scores them higher), titled with the nearest digit.
cols = 10
offset = 0
plt.figure(figsize=(12, 12))
for i in range(10):
    # left-most column: the center image of digit i
    plt.subplot(10, (cols + 1), (cols + 1)*i + 1)
    plt.imshow(centers[i].reshape(28, 28), cmap="gray", interpolation="none")
    plt.title("center:"+str(i))
    plt.xticks([])
    plt.yticks([])
    len_far_i = len(far_xs[i])
    print("# of far data for char {} = {}".format(i, len_far_i))
    for j in range(offset, min(offset + cols, len_far_i)):
        # NOTE(review): subplot index uses j directly, so with offset > 0
        # the images would not start at the second column — confirm.
        plt.subplot(10, (cols + 1), (cols + 1)*i + j + 2)
        plt.imshow(far_xs[i][j].reshape(28, 28), cmap="gray", interpolation="none")
        plt.title("nearest:" + str(nearest_centers[i][j]))
        plt.xticks([])
        plt.yticks([])
plt.tight_layout()
# Author: Takayoshi Iitsuka, 2016-10-05 (mail: iitt21-t@yahoo.co.jp)